########################################################################
################    TEXT ANALYSIS VOX PRESS RELEASE   ##################
########################################################################
# BY: ALICE TIANBO ZHANG (alice.tianbo.zhang@gmail.com)
# UPDATED: 2021/11/1

#### LOAD PACKAGE ####
#packages <- c("tm", "plyr", "ggplot2", "wordCloud", "RColorBrewer", "SnowballC","tm.plugin.webmining","stm")
#install.packages(packages)
library(tm)
library(plyr)
library(ggplot2)
library(wordcloud)
library(RColorBrewer)
library(SnowballC)
library(tm.plugin.webmining)
library(stm)

#### SET DIRECTORY ####
rm(list = ls()) 
setwd("~/Dropbox/Research_Columbia/Renewables Voting (Urpelainen Zhang)/JOP/UZ_JOP2021_Replication")

#### LOAD DATA ####
full_data <- read.csv("Data/Final/vox_district_year_cleaned.csv", header = T, sep = ",", strip.white = T, stringsAsFactors = F)
full_data <- full_data[complete.cases(full_data$text),]

#### PRE-PROCESS DATA ####
# Remote HTML
full_data$text <- gsub("<.*?>", "", full_data$text)
full_data$text <- gsub("&nbsp", "", full_data$text)

# Removes stopwords, numbers, punctuation, html, and then convert to lower case and stem words
processed <- textProcessor(documents = full_data$text, metadata = full_data, striphtml = TRUE)

# Plot the number of words and documents removed for different thresholds
plotRemoved(processed$documents, lower.thresh = seq(1, 200, by = 100))

# Remove infrequent words (words that do not appear in at least 10 documents were removed)
out <- prepDocuments(processed$documents, processed$vocab, processed$meta, lower.thresh = 10)

# Create file structure for STM package
docs <- out$documents # word indices and associated counts
vocab <- out$vocab # words associated with the word indices
meta <- out$meta # matrix with document covariates
meta$party <- as.factor(meta$party)


#### STRUCTURAL TOPIC MODEL - ESTIMATION ####
# Metadata (covariates) affects either: 
#  1) topic prevalence: how much each topic contributes to a document; or
#  2) topic content: 

## Model initiation w/t spectral (Model: Party * Cumulative Capacity/Count + Flexible trend in year)
# Option "spectral" ensures that the same results are generated each time
# Converged in iteration 60
stm_prev2 <- stm(docs, vocab, K = 30, prevalence = ~ party * cum_capacity_turbine + s(year), 
                 max.em.its = 200, data = meta, init.type = "Spectral", control=list(recoverEG=FALSE), seed = 123456)


# ## Model search w/t algorithm from Lee and Mimno (2014)
# Note: This is not a preferred model because the algorithm is not deterministic and produces wildly different results. 
# For details: Roberts, Margaret E., Brandon M. Stewart, and Dustin Tingley. 2019. “Stm: An R Package for Structural Topic Models.” Journal of Statistical Software 91 (2): 1–40.

# stm_prev3 <- stm(docs, vocab, K = 0, prevalence = ~ party * cum_capacity_turbine + s(year),
#                  max.em.its = 200, data = meta, init.type = "Spectral", control=list(recoverEG=FALSE), seed = 123456)
# 

#### STRUCTURAL TOPIC MODEL - INTERPRETATION ####

##  Check the words associated with each topic
# Words associated with each topic 
pdf("Results/Figures/STM/stmPrev2_wordLabels_fixedSeed.pdf")
plot.STM(stm_prev2, type = "labels", n = 10, topics = c(1:10), width = 150, xlim = c(-100,150), text.cex = 0.79)
dev.off()

# Proportion of documents that belong to each topic
pdf("Results/Figures/STM/stmPrev2_topicProportion_fixedSeed.pdf")
plot.STM(stm_prev2, type = "summary", n = 5, xlim = c(0, 0.3))
dev.off()

# Check the topic words associated with each topic
labelTopics(stm_prev2)

# Word Cloud
for (i in 1:30){
  pdf(paste0("Results/Figures/STM/stmPrev2_cloud_topic", i, "_fixedSeed.pdf"))
  cloud(stm_prev2, topic = i)
  dev.off()
}

# ## Check documents associated with each topic
# M2thoughts1 <- findThoughts(stm_prev2, texts = meta$text, n = 1, topics = 1)$docs[[1]]
# M2thoughts2 <- findThoughts(stm_prev2, texts = meta$text, n = 1, topics = 9)$docs[[1]]
# 
# par(mfrow = c(1, 2),mar = c(.5, .5, 1, .5))
# plotQuote(M2thoughts1, width = 30, main = "Topic 1")
# plotQuote(M2thoughts2, width = 30, main = "Topic 9", text.cex = 0.5)

## Check metadata and topic relationship
prep2 <- estimateEffect(1:30 ~ party * cum_capacity_turbine + s(year), stm_prev2, 
                        meta = meta, uncertainty = "None")

# plot.estimateEffect(prep2, covariate = "party", topics = c(1:30), 
#                     model = stm_prev2, method = "difference", 
#                     cov.value1 = "D", cov.value2 = "R", 
#                     xlab = "Republican    <---->    Democrat", 
#                     main = "Effect of Partisanship: Democrat vs. Republican", 
#                     xlim = c(-0.23, 0.23), labeltype = "prob", n = 1, verbose.labels = F)
          
# Effect of partisanship on topic : Compare D to R
topicNames <- c("Wind Projects", "Oil & Gas Price", "Congressman Murphy", "Energy Independence", "Disaster", 
                "Iraq War", "Tax Credit", "Public Program Funding", "Water Resources", "New Projects",
                "Land Resources", "Trade (China)", "Jobs", "Hurricane", "Disaster", 
                "Oil Drilling", "Renewable Energy", "Environmental Protection", "Policy (US)", "Congressman Sestak",
                "Budget", "Nuclear (Japan)", "Funding (California)", "Hurricane (NY/NJ)", "Public Program Funding", 
                "Energy Efficiency", "Clean Energy", "Events", "Climate Change", "R&D")

pdf("Results/Figures/STM/stmPrev2_effectParty_fixedSeed.pdf", width = 12, height = 10)
plot.estimateEffect(prep2, covariate = "party", topics = c(1:30), 
                    model = stm_prev2, method = "difference", 
                    cov.value1 = "D", cov.value2 = "R", 
                    xlab = "Republican    <------->    Democrat", 
                    main = "Effect of Partisanship: Democrat vs. Republican", 
                    xlim = c(-0.23, 0.23), cex = 0.52, labeltype = "custom", 
                    custom.labels = topicNames)
dev.off()

## Covariate interactions
# Fit a model of only one topic and loop through all topics
for (i in 1:30){
  prep2 <- estimateEffect(c(i) ~ party * cum_capacity_turbine + s(year), stm_prev2, 
                          meta = meta, uncertainty = "None")
  
  pdf(paste0("Results/Figures/STM/stmPrev2_effectCumCapaParty_topic", i, "_fixedSeed.pdf"))
  plot.estimateEffect(prep2, covariate = "cum_capacity_turbine",
                      model = stm_prev2, method = "continuous", xlab = "Cumulative Capacity (mw)",
                      moderator = "party", moderator.value = "D", linecol = "blue", printlegend = F,
                      ylim = c(0, 0.4), main = paste0("Topic - ", topicNames[[i]]))
  
  plot.estimateEffect(prep2, covariate = "cum_capacity_turbine",
                      model = stm_prev2, method = "continuous", xlab = "Cumulative Capacity (mw)",
                      moderator = "party", moderator.value = "R", linecol = "red", add = T, printlegend = F)
  legend("topleft", c("Democrat", "Republican"), lwd = 2, col = c("blue", "red"))
  dev.off()
}


